Genome-Wide Chromatin Accessibility Matrix

Author

Jialei Duan

Published

Wed Sep 21, 2022 17:57:23-05:00

Doi


[1] "2022-09-21 17:57:13 CDT"
[1] "America/Chicago"

Preparation

PROJECT_DIR <- file.path(
    "/Users/jialei/Dropbox/Data/Projects/UTSW",
    "/Cardiomyopathy/atac-seq"
)

Functions

Load required packages.

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.3.2.9000 ──
## ✔ ggplot2   3.3.6.9000        ✔ dplyr     1.0.99.9000  
## ✔ tibble    3.1.8.9001        ✔ stringr   1.4.1.9000   
## ✔ tidyr     1.2.1.9000        ✔ forcats   0.5.2.9000   
## ✔ readr     2.1.2.9000        ✔ lubridate 1.8.0.9000   
## ✔ purrr     0.9000.0.9000     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(patchwork)
library(extrafont)
## Registering fonts with R
source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)
`%+replace%` <- ggplot2::`%+replace%`

Python env

np <- reticulate::import("numpy", convert = TRUE)
cat("numpy version:", np$`__version__`, "\n")
numpy version: 1.22.4 
reticulate::py_config()
python:         /Users/jialei/.pyenv/shims/python
libpython:      /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
version:        3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33)  [Clang 13.0.1 ]
numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
numpy_version:  1.22.4
numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy

NOTE: Python version was forced by RETICULATE_PYTHON

Preprocessing

Matrix

# load matrix
feature_counts_files <- list.files(
    path = file.path(PROJECT_DIR, "defined_peaks/summarize_counts")
)

feature_counts_files <- feature_counts_files[
    stringr::str_detect(feature_counts_files, pattern = "merged.+.bz2")
]

matrix_readcount_use <- purrr::map(feature_counts_files, \(x) {
    dt <- data.table::fread(
        file = file.path(PROJECT_DIR, "defined_peaks/summarize_counts", x)
    )

    features <- dt[["Geneid"]]
    dt <- dt[, .SD, .SDcols = patterns("/project")]
    colnames(dt) <- stringr::str_remove(
        string = colnames(dt),
        pattern = "_atacseq/alignments/Aligned_sorted_deduped_q10.bam"
    ) |>
        stringr::str_remove(pattern = "^.+/")

    m <- as.matrix(dt, sparse = TRUE)
    rownames(m) <- features

    return(m)
})

matrix_readcount_use <- purrr::reduce(matrix_readcount_use, cbind)
dim(matrix_readcount_use)
[1] 207021     87
# rename samples
sample_ids <- tibble::tribble(
    ~old, ~new,
    "heart_fresh1_1", "F1_1",
    "heart_fresh1_2", "F1_2",
    "heart_fresh2_1", "F2_1",
    "heart_fresh2_2", "F2_2",
    "heart_fresh5_1", "F5_1",
    "heart_fresh5_2", "F5_2",
    "heart_myectomy_p11_1", "HOCM11_1",
    "heart_myectomy_p11_2", "HOCM11_2",
    "heart_myectomy_p7_1", "HOCM7_1",
    "heart_myectomy_p7_2", "HOCM7_2",
    #
    "heart_myectomy_p4_2", "MYEC4_2"
)

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) |>
    tibble::enframe(value = "sample") |>
    dplyr::left_join(sample_ids, by = c("sample" = "old")) |>
    dplyr::mutate(
        new = case_when(
            is.na(new) ~ sample,
            TRUE ~ new
        ),
        new = stringr::str_remove(string = new, pattern = "heart_"),
        #
        new = case_when(
            stringr::str_detect(
                string = new,
                pattern = "[hocm|Hocm|MYEC]"
            ) ~ stringr::str_to_upper(new),
            TRUE ~ stringr::str_to_title(new)
        )
    ) |>
    dplyr::pull(new)
# inspect matrix
matrix_readcount_use[1:5, 1:8]
                F1_1 F1_2 F2_1 F2_2 F5_1 F5_2 P3_1 P3_2
1_181358_181567    4    3    5   14   10   11    8    1
1_183716_183885    4    4    6    9    2    7    3    5
1_183999_184321    3    3    9   15    2   11    5   10
1_191239_191880   19   17   24   28   14   28    9   10
1_267894_268128    5    2   10   21    8   14    2    8
# filter blocked regions
blocked_regions <- read.table(
    file = file.path(
        dirname(PROJECT_DIR),
        "misc/annotations",
        "ENCFF419RSJ.bed.gz"
    ),
    stringsAsFactors = FALSE
)

blocked_regions <- GenomicRanges::GRanges(
    blocked_regions[, 1],
    IRanges::IRanges(blocked_regions[, 2] + 1, blocked_regions[, 3])
)

if (ensembldb::seqlevelsStyle(blocked_regions) == "UCSC") {
    ensembldb::seqlevelsStyle(blocked_regions) <- "Ensembl"
}

peak_regions <- stringr::str_split(
    string = rownames(matrix_readcount_use),
    pattern = "_", simplify = TRUE
) |>
    as.data.frame()
peak_regions <- GenomicRanges::GRanges(
    peak_regions[, 1],
    IRanges::IRanges(
        as.numeric(peak_regions[, 2]) + 1,
        as.numeric(peak_regions[, 3])
    )
)

idy1 <- S4Vectors::queryHits(
    GenomicRanges::findOverlaps(peak_regions, blocked_regions)
)

idy2 <- grep("Y|MT|GL|KI", peak_regions)
idy <- unique(c(idy1, idy2))

matrix_readcount_use <- matrix_readcount_use[-idy, ]
dim(matrix_readcount_use)
[1] 206024     87
# sanity check
stopifnot(
    dim(matrix_readcount_use) == c(206024, 87)
)
# check memory usage
walk(list(matrix_readcount_use), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
89.8 MB

R session info

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.2.1 (2022-06-23)
 os       macOS Monterey 12.6
 system   aarch64, darwin21.6.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2022-09-21
 pandoc   2.19.2 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package              * version       date (UTC) lib source
 AnnotationDbi          1.58.0        2022-04-26 [1] Bioconductor
 AnnotationFilter       1.20.0        2022-04-26 [1] Bioconductor
 assertthat             0.2.1         2019-03-21 [1] CRAN (R 4.2.0)
 Biobase                2.56.0        2022-04-26 [1] Bioconductor
 BiocFileCache          2.4.0         2022-04-26 [1] Bioconductor
 BiocGenerics           0.42.0        2022-04-26 [1] Bioconductor
 BiocIO                 1.6.0         2022-04-26 [1] Bioconductor
 BiocParallel           1.30.3        2022-06-05 [1] Bioconductor
 biomaRt                2.52.0        2022-04-26 [1] Bioconductor
 Biostrings             2.64.1        2022-08-18 [1] Bioconductor
 bit                    4.0.4         2020-08-04 [1] CRAN (R 4.2.0)
 bit64                  4.0.5         2020-08-30 [1] CRAN (R 4.2.0)
 bitops                 1.0-7         2021-04-24 [1] CRAN (R 4.2.0)
 blob                   1.2.3         2022-04-10 [1] CRAN (R 4.2.0)
 cachem                 1.0.6         2021-08-19 [1] CRAN (R 4.2.0)
 callr                  3.7.2         2022-08-22 [1] CRAN (R 4.2.1)
 cli                    3.4.0         2022-09-08 [1] CRAN (R 4.2.1)
 codetools              0.2-18        2020-11-04 [2] CRAN (R 4.2.1)
 colorspace             2.0-3         2022-02-21 [1] CRAN (R 4.2.0)
 crayon                 1.5.1         2022-03-26 [1] CRAN (R 4.2.0)
 curl                   4.3.2         2021-06-23 [1] CRAN (R 4.2.0)
 data.table             1.14.3        2022-09-12 [1] local
 DBI                    1.1.3         2022-06-18 [1] CRAN (R 4.2.0)
 dbplyr                 2.2.1         2022-06-27 [1] CRAN (R 4.2.1)
 DelayedArray           0.22.0        2022-04-26 [1] Bioconductor
 devtools               2.4.4.9000    2022-08-11 [1] Github (r-lib/devtools@c9696a6)
 digest                 0.6.29        2021-12-01 [1] CRAN (R 4.2.0)
 dplyr                * 1.0.99.9000   2022-09-21 [1] Github (tidyverse/dplyr@60f7a2d)
 ellipsis               0.3.2         2021-04-29 [1] CRAN (R 4.2.0)
 ensembldb              2.20.2        2022-06-16 [1] Bioconductor
 evaluate               0.16          2022-08-09 [1] CRAN (R 4.2.1)
 extrafont            * 0.18          2022-04-12 [1] CRAN (R 4.2.0)
 extrafontdb            1.0           2012-06-11 [1] CRAN (R 4.2.0)
 fansi                  1.0.3         2022-03-24 [1] CRAN (R 4.2.0)
 fastmap                1.1.0         2021-01-25 [1] CRAN (R 4.2.0)
 filelock               1.0.2         2018-10-05 [1] CRAN (R 4.2.0)
 forcats              * 0.5.2.9000    2022-08-20 [1] Github (tidyverse/forcats@bd319e0)
 fs                     1.5.2.9000    2022-08-24 [1] Github (r-lib/fs@238032f)
 generics               0.1.3         2022-07-05 [1] CRAN (R 4.2.1)
 GenomeInfoDb           1.32.4        2022-09-06 [1] Bioconductor
 GenomeInfoDbData       1.2.8         2022-04-22 [1] Bioconductor
 GenomicAlignments      1.32.1        2022-07-24 [1] Bioconductor
 GenomicFeatures        1.48.3        2022-05-31 [1] Bioconductor
 GenomicRanges          1.48.0        2022-04-26 [1] Bioconductor
 ggplot2              * 3.3.6.9000    2022-09-12 [1] Github (tidyverse/ggplot2@a58b48c)
 glue                   1.6.2.9000    2022-04-22 [1] Github (tidyverse/glue@d47d6c7)
 gtable                 0.3.1.9000    2022-09-01 [1] Github (r-lib/gtable@c1a7a81)
 hms                    1.1.2         2022-08-19 [1] CRAN (R 4.2.1)
 htmltools              0.5.3         2022-07-18 [1] CRAN (R 4.2.1)
 htmlwidgets            1.5.4         2022-08-23 [1] Github (ramnathv/htmlwidgets@400cf1a)
 httpuv                 1.6.6         2022-09-08 [1] CRAN (R 4.2.1)
 httr                   1.4.4         2022-08-17 [1] CRAN (R 4.2.1)
 IRanges                2.30.1        2022-08-18 [1] Bioconductor
 jsonlite               1.8.0         2022-02-22 [1] CRAN (R 4.2.0)
 KEGGREST               1.36.3        2022-07-12 [1] Bioconductor
 knitr                  1.40          2022-08-24 [1] CRAN (R 4.2.1)
 later                  1.3.0         2021-08-18 [1] CRAN (R 4.2.0)
 lattice                0.20-45       2021-09-22 [2] CRAN (R 4.2.1)
 lazyeval               0.2.2         2019-03-15 [1] CRAN (R 4.2.0)
 lifecycle              1.0.2.9000    2022-09-09 [1] Github (r-lib/lifecycle@a2666fc)
 lubridate            * 1.8.0.9000    2022-05-24 [1] Github (tidyverse/lubridate@0bb49b2)
 magrittr               2.0.3         2022-03-30 [1] CRAN (R 4.2.0)
 Matrix               * 1.5-1         2022-09-13 [1] CRAN (R 4.2.1)
 MatrixGenerics         1.8.1         2022-06-26 [1] Bioconductor
 matrixStats            0.62.0        2022-04-19 [1] CRAN (R 4.2.0)
 memoise                2.0.1         2021-11-26 [1] CRAN (R 4.2.0)
 mime                   0.12          2021-09-28 [1] CRAN (R 4.2.0)
 miniUI                 0.1.1.1       2018-05-18 [1] CRAN (R 4.2.0)
 munsell                0.5.0         2018-06-12 [1] CRAN (R 4.2.0)
 patchwork            * 1.1.2.9000    2022-08-20 [1] Github (thomasp85/patchwork@c14c960)
 pillar                 1.8.1         2022-08-19 [1] CRAN (R 4.2.1)
 pkgbuild               1.3.1         2021-12-20 [1] CRAN (R 4.2.0)
 pkgconfig              2.0.3         2019-09-22 [1] CRAN (R 4.2.0)
 pkgload                1.3.0         2022-06-27 [1] CRAN (R 4.2.1)
 png                    0.1-7         2013-12-03 [1] CRAN (R 4.2.0)
 prettyunits            1.1.1.9000    2022-04-22 [1] Github (r-lib/prettyunits@8706d89)
 processx               3.7.0         2022-07-07 [1] CRAN (R 4.2.1)
 profvis                0.3.7         2020-11-02 [1] CRAN (R 4.2.0)
 progress               1.2.2         2019-05-16 [1] CRAN (R 4.2.0)
 promises               1.2.0.1       2021-02-11 [1] CRAN (R 4.2.0)
 ProtGenerics           1.28.0        2022-04-26 [1] Bioconductor
 ps                     1.7.1         2022-06-18 [1] CRAN (R 4.2.0)
 purrr                * 0.9000.0.9000 2022-09-21 [1] Github (tidyverse/purrr@194b8ef)
 R.cache                0.16.0        2022-07-21 [1] CRAN (R 4.2.1)
 R.methodsS3            1.8.2         2022-06-13 [1] CRAN (R 4.2.0)
 R.oo                   1.25.0        2022-06-12 [1] CRAN (R 4.2.0)
 R.utils                2.12.0        2022-06-28 [1] CRAN (R 4.2.1)
 R6                     2.5.1.9000    2022-08-04 [1] Github (r-lib/R6@87d5e45)
 ragg                   1.2.2.9000    2022-09-12 [1] Github (r-lib/ragg@904e145)
 rappdirs               0.3.3         2021-01-31 [1] CRAN (R 4.2.0)
 Rcpp                   1.0.9         2022-07-08 [1] CRAN (R 4.2.1)
 RCurl                  1.98-1.8      2022-07-30 [1] CRAN (R 4.2.1)
 readr                * 2.1.2.9000    2022-09-20 [1] Github (tidyverse/readr@5cac6ed)
 remotes                2.4.2         2022-09-12 [1] Github (r-lib/remotes@bc0949d)
 restfulr               0.0.15        2022-06-16 [1] CRAN (R 4.2.0)
 reticulate             1.26          2022-08-31 [1] CRAN (R 4.2.1)
 rjson                  0.2.21        2022-01-09 [1] CRAN (R 4.2.0)
 rlang                  1.0.5.9000    2022-09-16 [1] Github (r-lib/rlang@5a25ff4)
 rmarkdown              2.16.1        2022-08-25 [1] Github (rstudio/rmarkdown@b8a9879)
 Rsamtools              2.12.0        2022-04-26 [1] Bioconductor
 RSQLite                2.2.17        2022-09-10 [1] CRAN (R 4.2.1)
 rtracklayer            1.56.1        2022-06-23 [1] Bioconductor
 Rttf2pt1               1.3.10        2022-02-07 [1] CRAN (R 4.2.0)
 S4Vectors              0.34.0        2022-04-26 [1] Bioconductor
 scales                 1.2.1.9000    2022-08-20 [1] Github (r-lib/scales@b3df2fb)
 sessioninfo            1.2.2         2021-12-06 [1] CRAN (R 4.2.0)
 shiny                  1.7.2         2022-07-19 [1] CRAN (R 4.2.1)
 stringi                1.7.8         2022-07-11 [1] CRAN (R 4.2.1)
 stringr              * 1.4.1.9000    2022-08-21 [1] Github (tidyverse/stringr@792bc92)
 styler               * 1.7.0.9002    2022-09-21 [1] Github (r-lib/styler@1f4437b)
 SummarizedExperiment   1.26.1        2022-04-29 [1] Bioconductor
 systemfonts            1.0.4         2022-02-11 [1] CRAN (R 4.2.0)
 textshaping            0.3.6         2021-10-13 [1] CRAN (R 4.2.0)
 tibble               * 3.1.8.9001    2022-09-20 [1] Github (tidyverse/tibble@b740af1)
 tidyr                * 1.2.1.9000    2022-09-09 [1] Github (tidyverse/tidyr@653def2)
 tidyselect             1.1.2.9000    2022-09-21 [1] Github (r-lib/tidyselect@edd0a3b)
 tidyverse            * 1.3.2.9000    2022-09-12 [1] Github (tidyverse/tidyverse@3be8283)
 tzdb                   0.3.0         2022-03-28 [1] CRAN (R 4.2.0)
 urlchecker             1.0.1         2021-11-30 [1] CRAN (R 4.2.0)
 usethis                2.1.6.9000    2022-09-15 [1] Github (r-lib/usethis@7c34252)
 utf8                   1.2.2         2021-07-24 [1] CRAN (R 4.2.0)
 vctrs                  0.4.1.9000    2022-09-19 [1] Github (r-lib/vctrs@0a219ba)
 withr                  2.5.0         2022-03-03 [1] CRAN (R 4.2.0)
 xfun                   0.33          2022-09-12 [1] CRAN (R 4.2.1)
 XML                    3.99-0.10     2022-06-09 [1] CRAN (R 4.2.0)
 xml2                   1.3.3         2021-11-30 [1] CRAN (R 4.2.0)
 xtable                 1.8-4         2019-04-21 [1] CRAN (R 4.2.0)
 XVector                0.36.0        2022-04-26 [1] Bioconductor
 yaml                   2.3.5         2022-02-21 [1] CRAN (R 4.2.0)
 zlibbioc               1.42.0        2022-04-26 [1] Bioconductor

 [1] /opt/homebrew/lib/R/4.2/site-library
 [2] /opt/homebrew/Cellar/r/4.2.1_4/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
 version:        3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33)  [Clang 13.0.1 ]
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
 numpy_version:  1.22.4
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────